{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "60-机器翻译与数据集.ipynb",
      "provenance": [],
      "collapsed_sections": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "#机器翻译与数据集\n",
        "机器翻译指的是将序列从一种语言自动翻译成另一种语言。"
      ],
      "metadata": {
        "id": "8TduQRbKaAnl"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "%tensorflow_version 2.x\n",
        "import tensorflow as tf\n",
        "device_name = tf.test.gpu_device_name()\n",
        "if device_name != '/device:GPU:0':\n",
        "  raise SystemError('GPU device not found')\n",
        "print('Found GPU at: {}'.format(device_name))"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PTvbH-DneRvs",
        "outputId": "f03dfe22-db59-4a6f-b54b-169080869755"
      },
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Found GPU at: /device:GPU:0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip3 install torch torchvision torchaudio"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "7u9bteZce0R-",
        "outputId": "870f6238-ec9c-4175-f572-c4e0eab1d94e"
      },
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: torch in /usr/local/lib/python3.7/dist-packages (1.10.0+cu111)\n",
            "Requirement already satisfied: torchvision in /usr/local/lib/python3.7/dist-packages (0.11.1+cu111)\n",
            "Requirement already satisfied: torchaudio in /usr/local/lib/python3.7/dist-packages (0.10.0+cu111)\n",
            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch) (3.10.0.2)\n",
            "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torchvision) (1.21.5)\n",
            "Requirement already satisfied: pillow!=8.3.0,>=5.3.0 in /usr/local/lib/python3.7/dist-packages (from torchvision) (7.1.2)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "!pip3 install d2l==0.14"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Od7vqKAJe2TS",
        "outputId": "ba1d1f81-5881-40b0-f54b-8a23814f0b7d"
      },
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Collecting d2l==0.14\n",
            "  Downloading d2l-0.14.0-py3-none-any.whl (48 kB)\n",
            "\u001b[?25l\r\u001b[K     |██████▊                         | 10 kB 20.0 MB/s eta 0:00:01\r\u001b[K     |█████████████▍                  | 20 kB 22.9 MB/s eta 0:00:01\r\u001b[K     |████████████████████            | 30 kB 10.8 MB/s eta 0:00:01\r\u001b[K     |██████████████████████████▉     | 40 kB 8.7 MB/s eta 0:00:01\r\u001b[K     |████████████████████████████████| 48 kB 2.6 MB/s \n",
            "\u001b[?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from d2l==0.14) (1.21.5)\n",
            "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from d2l==0.14) (3.2.2)\n",
            "Requirement already satisfied: jupyter in /usr/local/lib/python3.7/dist-packages (from d2l==0.14) (1.0.0)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from d2l==0.14) (1.3.5)\n",
            "Requirement already satisfied: nbconvert in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (5.6.1)\n",
            "Requirement already satisfied: qtconsole in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (5.2.2)\n",
            "Requirement already satisfied: ipywidgets in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (7.6.5)\n",
            "Requirement already satisfied: jupyter-console in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (5.2.0)\n",
            "Requirement already satisfied: ipykernel in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (4.10.1)\n",
            "Requirement already satisfied: notebook in /usr/local/lib/python3.7/dist-packages (from jupyter->d2l==0.14) (5.3.1)\n",
            "Requirement already satisfied: traitlets>=4.1.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel->jupyter->d2l==0.14) (5.1.1)\n",
            "Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel->jupyter->d2l==0.14) (5.5.0)\n",
            "Requirement already satisfied: jupyter-client in /usr/local/lib/python3.7/dist-packages (from ipykernel->jupyter->d2l==0.14) (5.3.5)\n",
            "Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.7/dist-packages (from ipykernel->jupyter->d2l==0.14) (5.1.1)\n",
            "Requirement already satisfied: decorator in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (4.4.2)\n",
            "Requirement already satisfied: pygments in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (2.6.1)\n",
            "Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (0.8.1)\n",
            "Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (57.4.0)\n",
            "Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (1.0.18)\n",
            "Requirement already satisfied: pickleshare in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (0.7.5)\n",
            "Requirement already satisfied: pexpect in /usr/local/lib/python3.7/dist-packages (from ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (4.8.0)\n",
            "Requirement already satisfied: six>=1.9.0 in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (1.15.0)\n",
            "Requirement already satisfied: wcwidth in /usr/local/lib/python3.7/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=4.0.0->ipykernel->jupyter->d2l==0.14) (0.2.5)\n",
            "Requirement already satisfied: jupyterlab-widgets>=1.0.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets->jupyter->d2l==0.14) (1.0.2)\n",
            "Requirement already satisfied: widgetsnbextension~=3.5.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets->jupyter->d2l==0.14) (3.5.2)\n",
            "Requirement already satisfied: ipython-genutils~=0.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets->jupyter->d2l==0.14) (0.2.0)\n",
            "Requirement already satisfied: nbformat>=4.2.0 in /usr/local/lib/python3.7/dist-packages (from ipywidgets->jupyter->d2l==0.14) (5.1.3)\n",
            "Requirement already satisfied: jsonschema!=2.5.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (4.3.3)\n",
            "Requirement already satisfied: jupyter-core in /usr/local/lib/python3.7/dist-packages (from nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (4.9.2)\n",
            "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (0.18.1)\n",
            "Requirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (4.11.1)\n",
            "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (21.4.0)\n",
            "Requirement already satisfied: importlib-resources>=1.4.0 in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (5.4.0)\n",
            "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (3.10.0.2)\n",
            "Requirement already satisfied: zipp>=3.1.0 in /usr/local/lib/python3.7/dist-packages (from importlib-resources>=1.4.0->jsonschema!=2.5.0,>=2.4->nbformat>=4.2.0->ipywidgets->jupyter->d2l==0.14) (3.7.0)\n",
            "Requirement already satisfied: jinja2 in /usr/local/lib/python3.7/dist-packages (from notebook->jupyter->d2l==0.14) (2.11.3)\n",
            "Requirement already satisfied: terminado>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from notebook->jupyter->d2l==0.14) (0.13.1)\n",
            "Requirement already satisfied: Send2Trash in /usr/local/lib/python3.7/dist-packages (from notebook->jupyter->d2l==0.14) (1.8.0)\n",
            "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from jupyter-client->ipykernel->jupyter->d2l==0.14) (2.8.2)\n",
            "Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.7/dist-packages (from jupyter-client->ipykernel->jupyter->d2l==0.14) (22.3.0)\n",
            "Requirement already satisfied: ptyprocess in /usr/local/lib/python3.7/dist-packages (from terminado>=0.8.1->notebook->jupyter->d2l==0.14) (0.7.0)\n",
            "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from jinja2->notebook->jupyter->d2l==0.14) (2.0.1)\n",
            "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->d2l==0.14) (1.3.2)\n",
            "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->d2l==0.14) (0.11.0)\n",
            "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->d2l==0.14) (3.0.7)\n",
            "Requirement already satisfied: mistune<2,>=0.8.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (0.8.4)\n",
            "Requirement already satisfied: defusedxml in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (0.7.1)\n",
            "Requirement already satisfied: pandocfilters>=1.4.1 in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (1.5.0)\n",
            "Requirement already satisfied: testpath in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (0.5.0)\n",
            "Requirement already satisfied: entrypoints>=0.2.2 in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (0.4)\n",
            "Requirement already satisfied: bleach in /usr/local/lib/python3.7/dist-packages (from nbconvert->jupyter->d2l==0.14) (4.1.0)\n",
            "Requirement already satisfied: packaging in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert->jupyter->d2l==0.14) (21.3)\n",
            "Requirement already satisfied: webencodings in /usr/local/lib/python3.7/dist-packages (from bleach->nbconvert->jupyter->d2l==0.14) (0.5.1)\n",
            "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas->d2l==0.14) (2018.9)\n",
            "Requirement already satisfied: qtpy in /usr/local/lib/python3.7/dist-packages (from qtconsole->jupyter->d2l==0.14) (2.0.1)\n",
            "Installing collected packages: d2l\n",
            "Successfully installed d2l-0.14.0\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os \n",
        "import torch\n",
        "from d2l import torch as d2l"
      ],
      "metadata": {
        "id": "f255eLvgei3-"
      },
      "execution_count": 10,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##下载和预处理数据集\n",
        "Tatoeba项目的双语句子对组成的“英-法”数据集，数据集中的每一行都是制表符分隔的文本序列对"
      ],
      "metadata": {
        "id": "O6qBx3Xmgghh"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip', '94646ad1522d915e7b0f9296181140edcf86a4f5')\n",
        "\n",
        "def read_data_nmt():\n",
        "    \"\"\"载入 \"英语-法语\" 数据集\"\"\"\n",
        "    data_dir = d2l.download_extract('fra-eng')\n",
        "    with open(os.path.join(data_dir,'fra.txt'), 'r', encoding='utf-8') as f:\n",
        "        return f.read()\n",
        "\n",
        "raw_text = read_data_nmt()\n",
        "print(raw_text[:75])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ZD0NjCyvcwNw",
        "outputId": "71056472-1b91-4ddc-b46f-51593c81f5bd"
      },
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Downloading ../data/fra-eng.zip from http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip...\n",
            "Go.\tVa !\n",
            "Hi.\tSalut !\n",
            "Run!\tCours !\n",
            "Run!\tCourez !\n",
            "Who?\tQui ?\n",
            "Wow!\tÇa alors !\n",
            "\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def preprocess_nmt(text):\n",
        "    \"\"\"预处理 \"英语-法语\" 数据集\"\"\"\n",
        "    def no_space(char, prev_char):\n",
        "        return char in set(',.!?') and prev_char != ' '\n",
        "\n",
        "    # 使用空格替换不间断空格\n",
        "    # 使用小写字母替换大写字母\n",
        "    text = text.replace('\\u202f', ' ').replace('\\xa0', ' ').lower()\n",
        "    # 在单词和标点符号之间插入空格\n",
        "    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i,char in enumerate(text)]\n",
        "    return ''.join(out)\n",
        "\n",
        "text = preprocess_nmt(raw_text)\n",
        "print(text[:80])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "_A_cSE-ud-M4",
        "outputId": "1768fb9b-607c-4e0d-c8db-bb176ff4170f"
      },
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "go .\tva !\n",
            "hi .\tsalut !\n",
            "run !\tcours !\n",
            "run !\tcourez !\n",
            "who ?\tqui ?\n",
            "wow !\tça alors !\n"
          ]
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "##词元化"
      ],
      "metadata": {
        "id": "dsLOTMPIg6EW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def tokenize_nmt(text, num_examples=None):\n",
        "    \"\"\"词元化 \"英语-法语\" 数据数据集\"\"\"\n",
        "    source, target = [], []\n",
        "    for i, line in enumerate(text.split('\\n')):\n",
        "        if num_examples and i > num_examples:\n",
        "            break\n",
        "        parts = line.split('\\t')\n",
        "        if len(parts) == 2:\n",
        "            source.append(parts[0].split(' '))\n",
        "            target.append(parts[1].split(' '))\n",
        "    return source, target\n",
        "\n",
        "source, target = tokenize_nmt(text)"
      ],
      "metadata": {
        "id": "tmA3VhfrgaoZ"
      },
      "execution_count": 13,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "source[:6], target[:6]"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bKYKK24rh-xA",
        "outputId": "9145945e-81ac-44b9-c132-227dfd287ce4"
      },
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "([['go', '.'],\n",
              "  ['hi', '.'],\n",
              "  ['run', '!'],\n",
              "  ['run', '!'],\n",
              "  ['who', '?'],\n",
              "  ['wow', '!']],\n",
              " [['va', '!'],\n",
              "  ['salut', '!'],\n",
              "  ['cours', '!'],\n",
              "  ['courez', '!'],\n",
              "  ['qui', '?'],\n",
              "  ['ça', 'alors', '!']])"
            ]
          },
          "metadata": {},
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist):\n",
        "    \"\"\"绘制列表长度对的直方图\"\"\"\n",
        "    d2l.set_figsize()\n",
        "    _,_,patches = d2l.plt.hist([[len(l) for l in xlist], [len(l) for l in ylist]])\n",
        "    d2l.plt.xlabel(xlabel)\n",
        "    d2l.plt.ylabel(ylabel)\n",
        "    for patch in patches[1].patches:\n",
        "        patch.set_hatch('/')\n",
        "    d2l.plt.legend(legend)\n",
        "\n",
        "show_list_len_pair_hist(['source','target'],'#tokens per sequence','count',source,target);"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 262
        },
        "id": "NCkfXYCfiCHu",
        "outputId": "f073d017-4cc5-4b15-985c-c04ca49c1831"
      },
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<Figure size 252x180 with 1 Axes>"
            ],
            "image/svg+xml": "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"no\"?>\n<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n  \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n<!-- Created with matplotlib (https://matplotlib.org/) -->\n<svg height=\"180.65625pt\" version=\"1.1\" viewBox=\"0 0 274.320356 180.65625\" width=\"274.320356pt\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n <defs>\n  <style type=\"text/css\">\n*{stroke-linecap:butt;stroke-linejoin:round;}\n  </style>\n </defs>\n <g id=\"figure_1\">\n  <g id=\"patch_1\">\n   <path d=\"M 0 180.65625 \nL 274.320356 180.65625 \nL 274.320356 0 \nL 0 0 \nz\n\" style=\"fill:none;\"/>\n  </g>\n  <g id=\"axes_1\">\n   <g id=\"patch_2\">\n    <path d=\"M 66.053125 143.1 \nL 261.353125 143.1 \nL 261.353125 7.2 \nL 66.053125 7.2 \nz\n\" style=\"fill:#ffffff;\"/>\n   </g>\n   <g id=\"patch_3\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 74.930398 143.1 \nL 82.177151 143.1 \nL 82.177151 13.671429 \nL 74.930398 13.671429 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_4\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 93.047281 143.1 \nL 100.294034 143.1 \nL 100.294034 69.112838 \nL 93.047281 69.112838 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_5\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 111.164164 143.1 \nL 118.410917 143.1 \nL 118.410917 138.560551 \nL 111.164164 138.560551 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_6\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 129.281047 143.1 \nL 136.5278 143.1 \nL 136.5278 142.652168 \nL 129.281047 142.652168 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_7\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 147.39793 143.1 \nL 154.644683 143.1 \nL 154.644683 143.045112 \nL 147.39793 143.045112 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_8\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 165.514813 143.1 \nL 172.761567 143.1 \nL 172.761567 143.083783 \nL 165.514813 143.083783 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_9\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 183.631696 143.1 \nL 190.87845 143.1 \nL 190.87845 143.092515 \nL 183.631696 143.092515 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_10\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 201.74858 143.1 \nL 208.995333 143.1 \nL 208.995333 143.098753 \nL 201.74858 143.098753 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_11\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 219.865463 143.1 \nL 227.112216 143.1 \nL 227.112216 143.097505 \nL 219.865463 143.097505 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_12\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 237.982346 143.1 \nL 245.229099 143.1 \nL 245.229099 143.1 \nL 237.982346 143.1 \nz\n\" style=\"fill:#1f77b4;\"/>\n   </g>\n   <g id=\"patch_13\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 82.177151 143.1 \nL 89.423904 143.1 \nL 89.423904 26.397854 \nL 82.177151 26.397854 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_14\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 100.294034 143.1 \nL 107.540787 143.1 \nL 107.540787 59.453878 \nL 100.294034 59.453878 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_15\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 118.410917 143.1 \nL 125.65767 143.1 \nL 125.65767 136.044456 \nL 118.410917 136.044456 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_16\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 136.5278 143.1 \nL 143.774554 143.1 \nL 143.774554 142.176891 \nL 136.5278 142.176891 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_17\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 154.644683 143.1 \nL 161.891437 143.1 \nL 161.891437 142.993967 \nL 154.644683 142.993967 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_18\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 172.761567 143.1 \nL 180.00832 143.1 \nL 180.00832 143.058834 \nL 172.761567 143.058834 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_19\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 190.87845 143.1 \nL 198.125203 143.1 \nL 198.125203 143.097505 \nL 190.87845 143.097505 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_20\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 208.995333 143.1 \nL 216.242086 143.1 \nL 216.242086 143.097505 \nL 208.995333 143.097505 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_21\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 227.112216 143.1 \nL 234.358969 143.1 \nL 234.358969 143.097505 \nL 227.112216 143.097505 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"patch_22\">\n    <path clip-path=\"url(#pff4c7d2db4)\" d=\"M 245.229099 143.1 \nL 252.475852 143.1 \nL 252.475852 143.096258 \nL 245.229099 143.096258 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n   </g>\n   <g id=\"matplotlib.axis_1\">\n    <g id=\"xtick_1\">\n     <g id=\"line2d_1\">\n      <defs>\n       <path d=\"M 0 0 \nL 0 3.5 \n\" id=\"me63f2f1450\" style=\"stroke:#000000;stroke-width:0.8;\"/>\n      </defs>\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.648394\" xlink:href=\"#me63f2f1450\" y=\"143.1\"/>\n      </g>\n     </g>\n     <g id=\"text_1\">\n      <!-- 0 -->\n      <defs>\n       <path d=\"M 31.78125 66.40625 \nQ 24.171875 66.40625 20.328125 58.90625 \nQ 16.5 51.421875 16.5 36.375 \nQ 16.5 21.390625 20.328125 13.890625 \nQ 24.171875 6.390625 31.78125 6.390625 \nQ 39.453125 6.390625 43.28125 13.890625 \nQ 47.125 21.390625 47.125 36.375 \nQ 47.125 51.421875 43.28125 58.90625 \nQ 39.453125 66.40625 31.78125 66.40625 \nz\nM 31.78125 74.21875 \nQ 44.046875 74.21875 50.515625 64.515625 \nQ 56.984375 54.828125 56.984375 36.375 \nQ 56.984375 17.96875 50.515625 8.265625 \nQ 44.046875 -1.421875 31.78125 -1.421875 \nQ 19.53125 -1.421875 13.0625 8.265625 \nQ 6.59375 17.96875 6.59375 36.375 \nQ 6.59375 54.828125 13.0625 64.515625 \nQ 19.53125 74.21875 31.78125 74.21875 \nz\n\" id=\"DejaVuSans-48\"/>\n      </defs>\n      <g transform=\"translate(63.467144 157.698438)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"xtick_2\">\n     <g id=\"line2d_2\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"131.351548\" xlink:href=\"#me63f2f1450\" y=\"143.1\"/>\n      </g>\n     </g>\n     <g id=\"text_2\">\n      <!-- 20 -->\n      <defs>\n       <path d=\"M 19.1875 8.296875 \nL 53.609375 8.296875 \nL 53.609375 0 \nL 7.328125 0 \nL 7.328125 8.296875 \nQ 12.9375 14.109375 22.625 23.890625 \nQ 32.328125 33.6875 34.8125 36.53125 \nQ 39.546875 41.84375 41.421875 45.53125 \nQ 43.3125 49.21875 43.3125 52.78125 \nQ 43.3125 58.59375 39.234375 62.25 \nQ 35.15625 65.921875 28.609375 65.921875 \nQ 23.96875 65.921875 18.8125 64.3125 \nQ 13.671875 62.703125 7.8125 59.421875 \nL 7.8125 69.390625 \nQ 13.765625 71.78125 18.9375 73 \nQ 24.125 74.21875 28.421875 74.21875 \nQ 39.75 74.21875 46.484375 68.546875 \nQ 53.21875 62.890625 53.21875 53.421875 \nQ 53.21875 48.921875 51.53125 44.890625 \nQ 49.859375 40.875 45.40625 35.40625 \nQ 44.1875 33.984375 37.640625 27.21875 \nQ 31.109375 20.453125 19.1875 8.296875 \nz\n\" id=\"DejaVuSans-50\"/>\n      </defs>\n      <g transform=\"translate(124.989048 157.698438)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-50\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"xtick_3\">\n     <g id=\"line2d_3\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"196.054702\" xlink:href=\"#me63f2f1450\" y=\"143.1\"/>\n      </g>\n     </g>\n     <g id=\"text_3\">\n      <!-- 40 -->\n      <defs>\n       <path d=\"M 37.796875 64.3125 \nL 12.890625 25.390625 \nL 37.796875 25.390625 \nz\nM 35.203125 72.90625 \nL 47.609375 72.90625 \nL 47.609375 25.390625 \nL 58.015625 25.390625 \nL 58.015625 17.1875 \nL 47.609375 17.1875 \nL 47.609375 0 \nL 37.796875 0 \nL 37.796875 17.1875 \nL 4.890625 17.1875 \nL 4.890625 26.703125 \nz\n\" id=\"DejaVuSans-52\"/>\n      </defs>\n      <g transform=\"translate(189.692202 157.698438)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-52\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"xtick_4\">\n     <g id=\"line2d_4\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"260.757856\" xlink:href=\"#me63f2f1450\" y=\"143.1\"/>\n      </g>\n     </g>\n     <g id=\"text_4\">\n      <!-- 60 -->\n      <defs>\n       <path d=\"M 33.015625 40.375 \nQ 26.375 40.375 22.484375 35.828125 \nQ 18.609375 31.296875 18.609375 23.390625 \nQ 18.609375 15.53125 22.484375 10.953125 \nQ 26.375 6.390625 33.015625 6.390625 \nQ 39.65625 6.390625 43.53125 10.953125 \nQ 47.40625 15.53125 47.40625 23.390625 \nQ 47.40625 31.296875 43.53125 35.828125 \nQ 39.65625 40.375 33.015625 40.375 \nz\nM 52.59375 71.296875 \nL 52.59375 62.3125 \nQ 48.875 64.0625 45.09375 64.984375 \nQ 41.3125 65.921875 37.59375 65.921875 \nQ 27.828125 65.921875 22.671875 59.328125 \nQ 17.53125 52.734375 16.796875 39.40625 \nQ 19.671875 43.65625 24.015625 45.921875 \nQ 28.375 48.1875 33.59375 48.1875 \nQ 44.578125 48.1875 50.953125 41.515625 \nQ 57.328125 34.859375 57.328125 23.390625 \nQ 57.328125 12.15625 50.6875 5.359375 \nQ 44.046875 -1.421875 33.015625 -1.421875 \nQ 20.359375 -1.421875 13.671875 8.265625 \nQ 6.984375 17.96875 6.984375 36.375 \nQ 6.984375 53.65625 15.1875 63.9375 \nQ 23.390625 74.21875 37.203125 74.21875 \nQ 40.921875 74.21875 44.703125 73.484375 \nQ 48.484375 72.75 52.59375 71.296875 \nz\n\" id=\"DejaVuSans-54\"/>\n      </defs>\n      <g transform=\"translate(254.395356 157.698438)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-54\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"text_5\">\n     <!-- #tokens per sequence -->\n     <defs>\n      <path d=\"M 51.125 44 \nL 36.921875 44 \nL 32.8125 27.6875 \nL 47.125 27.6875 \nz\nM 43.796875 71.78125 \nL 38.71875 51.515625 \nL 52.984375 51.515625 \nL 58.109375 71.78125 \nL 65.921875 71.78125 \nL 60.890625 51.515625 \nL 76.125 51.515625 \nL 76.125 44 \nL 58.984375 44 \nL 54.984375 27.6875 \nL 70.515625 27.6875 \nL 70.515625 20.21875 \nL 53.078125 20.21875 \nL 48 0 \nL 40.1875 0 \nL 45.21875 20.21875 \nL 30.90625 20.21875 \nL 25.875 0 \nL 18.015625 0 \nL 23.09375 20.21875 \nL 7.71875 20.21875 \nL 7.71875 27.6875 \nL 24.90625 27.6875 \nL 29 44 \nL 13.28125 44 \nL 13.28125 51.515625 \nL 30.90625 51.515625 \nL 35.890625 71.78125 \nz\n\" id=\"DejaVuSans-35\"/>\n      <path d=\"M 18.3125 70.21875 \nL 18.3125 54.6875 \nL 36.8125 54.6875 \nL 36.8125 47.703125 \nL 18.3125 47.703125 \nL 18.3125 18.015625 \nQ 18.3125 11.328125 20.140625 9.421875 \nQ 21.96875 7.515625 27.59375 7.515625 \nL 36.8125 7.515625 \nL 36.8125 0 \nL 27.59375 0 \nQ 17.1875 0 13.234375 3.875 \nQ 9.28125 7.765625 9.28125 18.015625 \nL 9.28125 47.703125 \nL 2.6875 47.703125 \nL 2.6875 54.6875 \nL 9.28125 54.6875 \nL 9.28125 70.21875 \nz\n\" id=\"DejaVuSans-116\"/>\n      <path d=\"M 30.609375 48.390625 \nQ 23.390625 48.390625 19.1875 42.75 \nQ 14.984375 37.109375 14.984375 27.296875 \nQ 14.984375 17.484375 19.15625 11.84375 \nQ 23.34375 6.203125 30.609375 6.203125 \nQ 37.796875 6.203125 41.984375 11.859375 \nQ 46.1875 17.53125 46.1875 27.296875 \nQ 46.1875 37.015625 41.984375 42.703125 \nQ 37.796875 48.390625 30.609375 48.390625 \nz\nM 30.609375 56 \nQ 42.328125 56 49.015625 48.375 \nQ 55.71875 40.765625 55.71875 27.296875 \nQ 55.71875 13.875 49.015625 6.21875 \nQ 42.328125 -1.421875 30.609375 -1.421875 \nQ 18.84375 -1.421875 12.171875 6.21875 \nQ 5.515625 13.875 5.515625 27.296875 \nQ 5.515625 40.765625 12.171875 48.375 \nQ 18.84375 56 30.609375 56 \nz\n\" id=\"DejaVuSans-111\"/>\n      <path d=\"M 9.078125 75.984375 \nL 18.109375 75.984375 \nL 18.109375 31.109375 \nL 44.921875 54.6875 \nL 56.390625 54.6875 \nL 27.390625 29.109375 \nL 57.625 0 \nL 45.90625 0 \nL 18.109375 26.703125 \nL 18.109375 0 \nL 9.078125 0 \nz\n\" id=\"DejaVuSans-107\"/>\n      <path d=\"M 56.203125 29.59375 \nL 56.203125 25.203125 \nL 14.890625 25.203125 \nQ 15.484375 15.921875 20.484375 11.0625 \nQ 25.484375 6.203125 34.421875 6.203125 \nQ 39.59375 6.203125 44.453125 7.46875 \nQ 49.3125 8.734375 54.109375 11.28125 \nL 54.109375 2.78125 \nQ 49.265625 0.734375 44.1875 -0.34375 \nQ 39.109375 -1.421875 33.890625 -1.421875 \nQ 20.796875 -1.421875 13.15625 6.1875 \nQ 5.515625 13.8125 5.515625 26.8125 \nQ 5.515625 40.234375 12.765625 48.109375 \nQ 20.015625 56 32.328125 56 \nQ 43.359375 56 49.78125 48.890625 \nQ 56.203125 41.796875 56.203125 29.59375 \nz\nM 47.21875 32.234375 \nQ 47.125 39.59375 43.09375 43.984375 \nQ 39.0625 48.390625 32.421875 48.390625 \nQ 24.90625 48.390625 20.390625 44.140625 \nQ 15.875 39.890625 15.1875 32.171875 \nz\n\" id=\"DejaVuSans-101\"/>\n      <path d=\"M 54.890625 33.015625 \nL 54.890625 0 \nL 45.90625 0 \nL 45.90625 32.71875 \nQ 45.90625 40.484375 42.875 44.328125 \nQ 39.84375 48.1875 33.796875 48.1875 \nQ 26.515625 48.1875 22.3125 43.546875 \nQ 18.109375 38.921875 18.109375 30.90625 \nL 18.109375 0 \nL 9.078125 0 \nL 9.078125 54.6875 \nL 18.109375 54.6875 \nL 18.109375 46.1875 \nQ 21.34375 51.125 25.703125 53.5625 \nQ 30.078125 56 35.796875 56 \nQ 45.21875 56 50.046875 50.171875 \nQ 54.890625 44.34375 54.890625 33.015625 \nz\n\" id=\"DejaVuSans-110\"/>\n      <path d=\"M 44.28125 53.078125 \nL 44.28125 44.578125 \nQ 40.484375 46.53125 36.375 47.5 \nQ 32.28125 48.484375 27.875 48.484375 \nQ 21.1875 48.484375 17.84375 46.4375 \nQ 14.5 44.390625 14.5 40.28125 \nQ 14.5 37.15625 16.890625 35.375 \nQ 19.28125 33.59375 26.515625 31.984375 \nL 29.59375 31.296875 \nQ 39.15625 29.25 43.1875 25.515625 \nQ 47.21875 21.78125 47.21875 15.09375 \nQ 47.21875 7.46875 41.1875 3.015625 \nQ 35.15625 -1.421875 24.609375 -1.421875 \nQ 20.21875 -1.421875 15.453125 -0.5625 \nQ 10.6875 0.296875 5.421875 2 \nL 5.421875 11.28125 \nQ 10.40625 8.6875 15.234375 7.390625 \nQ 20.0625 6.109375 24.8125 6.109375 \nQ 31.15625 6.109375 34.5625 8.28125 \nQ 37.984375 10.453125 37.984375 14.40625 \nQ 37.984375 18.0625 35.515625 20.015625 \nQ 33.0625 21.96875 24.703125 23.78125 \nL 21.578125 24.515625 \nQ 13.234375 26.265625 9.515625 29.90625 \nQ 5.8125 33.546875 5.8125 39.890625 \nQ 5.8125 47.609375 11.28125 51.796875 \nQ 16.75 56 26.8125 56 \nQ 31.78125 56 36.171875 55.265625 \nQ 40.578125 54.546875 44.28125 53.078125 \nz\n\" id=\"DejaVuSans-115\"/>\n      <path id=\"DejaVuSans-32\"/>\n      <path d=\"M 18.109375 8.203125 \nL 18.109375 -20.796875 \nL 9.078125 -20.796875 \nL 9.078125 54.6875 \nL 18.109375 54.6875 \nL 18.109375 46.390625 \nQ 20.953125 51.265625 25.265625 53.625 \nQ 29.59375 56 35.59375 56 \nQ 45.5625 56 51.78125 48.09375 \nQ 58.015625 40.1875 58.015625 27.296875 \nQ 58.015625 14.40625 51.78125 6.484375 \nQ 45.5625 -1.421875 35.59375 -1.421875 \nQ 29.59375 -1.421875 25.265625 0.953125 \nQ 20.953125 3.328125 18.109375 8.203125 \nz\nM 48.6875 27.296875 \nQ 48.6875 37.203125 44.609375 42.84375 \nQ 40.53125 48.484375 33.40625 48.484375 \nQ 26.265625 48.484375 22.1875 42.84375 \nQ 18.109375 37.203125 18.109375 27.296875 \nQ 18.109375 17.390625 22.1875 11.75 \nQ 26.265625 6.109375 33.40625 6.109375 \nQ 40.53125 6.109375 44.609375 11.75 \nQ 48.6875 17.390625 48.6875 27.296875 \nz\n\" id=\"DejaVuSans-112\"/>\n      <path d=\"M 41.109375 46.296875 \nQ 39.59375 47.171875 37.8125 47.578125 \nQ 36.03125 48 33.890625 48 \nQ 26.265625 48 22.1875 43.046875 \nQ 18.109375 38.09375 18.109375 28.8125 \nL 18.109375 0 \nL 9.078125 0 \nL 9.078125 54.6875 \nL 18.109375 54.6875 \nL 18.109375 46.1875 \nQ 20.953125 51.171875 25.484375 53.578125 \nQ 30.03125 56 36.53125 56 \nQ 37.453125 56 38.578125 55.875 \nQ 39.703125 55.765625 41.0625 55.515625 \nz\n\" id=\"DejaVuSans-114\"/>\n      <path d=\"M 14.796875 27.296875 \nQ 14.796875 17.390625 18.875 11.75 \nQ 22.953125 6.109375 30.078125 6.109375 \nQ 37.203125 6.109375 41.296875 11.75 \nQ 45.40625 17.390625 45.40625 27.296875 \nQ 45.40625 37.203125 41.296875 42.84375 \nQ 37.203125 48.484375 30.078125 48.484375 \nQ 22.953125 48.484375 18.875 42.84375 \nQ 14.796875 37.203125 14.796875 27.296875 \nz\nM 45.40625 8.203125 \nQ 42.578125 3.328125 38.25 0.953125 \nQ 33.9375 -1.421875 27.875 -1.421875 \nQ 17.96875 -1.421875 11.734375 6.484375 \nQ 5.515625 14.40625 5.515625 27.296875 \nQ 5.515625 40.1875 11.734375 48.09375 \nQ 17.96875 56 27.875 56 \nQ 33.9375 56 38.25 53.625 \nQ 42.578125 51.265625 45.40625 46.390625 \nL 45.40625 54.6875 \nL 54.390625 54.6875 \nL 54.390625 -20.796875 \nL 45.40625 -20.796875 \nz\n\" id=\"DejaVuSans-113\"/>\n      <path d=\"M 8.5 21.578125 \nL 8.5 54.6875 \nL 17.484375 54.6875 \nL 17.484375 21.921875 \nQ 17.484375 14.15625 20.5 10.265625 \nQ 23.53125 6.390625 29.59375 6.390625 \nQ 36.859375 6.390625 41.078125 11.03125 \nQ 45.3125 15.671875 45.3125 23.6875 \nL 45.3125 54.6875 \nL 54.296875 54.6875 \nL 54.296875 0 \nL 45.3125 0 \nL 45.3125 8.40625 \nQ 42.046875 3.421875 37.71875 1 \nQ 33.40625 -1.421875 27.6875 -1.421875 \nQ 18.265625 -1.421875 13.375 4.4375 \nQ 8.5 10.296875 8.5 21.578125 \nz\nM 31.109375 56 \nz\n\" id=\"DejaVuSans-117\"/>\n      <path d=\"M 48.78125 52.59375 \nL 48.78125 44.1875 \nQ 44.96875 46.296875 41.140625 47.34375 \nQ 37.3125 48.390625 33.40625 48.390625 \nQ 24.65625 48.390625 19.8125 42.84375 \nQ 14.984375 37.3125 14.984375 27.296875 \nQ 14.984375 17.28125 19.8125 11.734375 \nQ 24.65625 6.203125 33.40625 6.203125 \nQ 37.3125 6.203125 41.140625 7.25 \nQ 44.96875 8.296875 48.78125 10.40625 \nL 48.78125 2.09375 \nQ 45.015625 0.34375 40.984375 -0.53125 \nQ 36.96875 -1.421875 32.421875 -1.421875 \nQ 20.0625 -1.421875 12.78125 6.34375 \nQ 5.515625 14.109375 5.515625 27.296875 \nQ 5.515625 40.671875 12.859375 48.328125 \nQ 20.21875 56 33.015625 56 \nQ 37.15625 56 41.109375 55.140625 \nQ 45.0625 54.296875 48.78125 52.59375 \nz\n\" id=\"DejaVuSans-99\"/>\n     </defs>\n     <g transform=\"translate(107.35 171.376563)scale(0.1 -0.1)\">\n      <use xlink:href=\"#DejaVuSans-35\"/>\n      <use x=\"83.789062\" xlink:href=\"#DejaVuSans-116\"/>\n      <use x=\"122.998047\" xlink:href=\"#DejaVuSans-111\"/>\n      <use x=\"184.179688\" xlink:href=\"#DejaVuSans-107\"/>\n      <use x=\"238.464844\" xlink:href=\"#DejaVuSans-101\"/>\n      <use x=\"299.988281\" xlink:href=\"#DejaVuSans-110\"/>\n      <use x=\"363.367188\" xlink:href=\"#DejaVuSans-115\"/>\n      <use x=\"415.466797\" xlink:href=\"#DejaVuSans-32\"/>\n      <use x=\"447.253906\" xlink:href=\"#DejaVuSans-112\"/>\n      <use x=\"510.730469\" xlink:href=\"#DejaVuSans-101\"/>\n      <use x=\"572.253906\" xlink:href=\"#DejaVuSans-114\"/>\n      <use x=\"613.367188\" xlink:href=\"#DejaVuSans-32\"/>\n      <use x=\"645.154297\" xlink:href=\"#DejaVuSans-115\"/>\n      <use x=\"697.253906\" xlink:href=\"#DejaVuSans-101\"/>\n      <use x=\"758.777344\" xlink:href=\"#DejaVuSans-113\"/>\n      <use x=\"822.253906\" xlink:href=\"#DejaVuSans-117\"/>\n      <use x=\"885.632812\" xlink:href=\"#DejaVuSans-101\"/>\n      <use x=\"947.15625\" xlink:href=\"#DejaVuSans-110\"/>\n      <use x=\"1010.535156\" xlink:href=\"#DejaVuSans-99\"/>\n      <use x=\"1065.515625\" xlink:href=\"#DejaVuSans-101\"/>\n     </g>\n    </g>\n   </g>\n   <g id=\"matplotlib.axis_2\">\n    <g id=\"ytick_1\">\n     <g id=\"line2d_5\">\n      <defs>\n       <path d=\"M 0 0 \nL -3.5 0 \n\" id=\"m6f72079ae8\" style=\"stroke:#000000;stroke-width:0.8;\"/>\n      </defs>\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"143.1\"/>\n      </g>\n     </g>\n     <g id=\"text_6\">\n      <!-- 0 -->\n      <g transform=\"translate(52.690625 146.899219)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"ytick_2\">\n     <g id=\"line2d_6\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"118.151116\"/>\n      </g>\n     </g>\n     <g id=\"text_7\">\n      <!-- 20000 -->\n      <g transform=\"translate(27.240625 121.950335)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-50\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"127.246094\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"190.869141\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"254.492188\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"ytick_3\">\n     <g id=\"line2d_7\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"93.202233\"/>\n      </g>\n     </g>\n     <g id=\"text_8\">\n      <!-- 40000 -->\n      <g transform=\"translate(27.240625 97.001451)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-52\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"127.246094\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"190.869141\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"254.492188\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"ytick_4\">\n     <g id=\"line2d_8\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"68.253349\"/>\n      </g>\n     </g>\n     <g id=\"text_9\">\n      <!-- 60000 -->\n      <g transform=\"translate(27.240625 72.052568)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-54\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"127.246094\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"190.869141\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"254.492188\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"ytick_5\">\n     <g id=\"line2d_9\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"43.304465\"/>\n      </g>\n     </g>\n     <g id=\"text_10\">\n      <!-- 80000 -->\n      <defs>\n       <path d=\"M 31.78125 34.625 \nQ 24.75 34.625 20.71875 30.859375 \nQ 16.703125 27.09375 16.703125 20.515625 \nQ 16.703125 13.921875 20.71875 10.15625 \nQ 24.75 6.390625 31.78125 6.390625 \nQ 38.8125 6.390625 42.859375 10.171875 \nQ 46.921875 13.96875 46.921875 20.515625 \nQ 46.921875 27.09375 42.890625 30.859375 \nQ 38.875 34.625 31.78125 34.625 \nz\nM 21.921875 38.8125 \nQ 15.578125 40.375 12.03125 44.71875 \nQ 8.5 49.078125 8.5 55.328125 \nQ 8.5 64.0625 14.71875 69.140625 \nQ 20.953125 74.21875 31.78125 74.21875 \nQ 42.671875 74.21875 48.875 69.140625 \nQ 55.078125 64.0625 55.078125 55.328125 \nQ 55.078125 49.078125 51.53125 44.71875 \nQ 48 40.375 41.703125 38.8125 \nQ 48.828125 37.15625 52.796875 32.3125 \nQ 56.78125 27.484375 56.78125 20.515625 \nQ 56.78125 9.90625 50.3125 4.234375 \nQ 43.84375 -1.421875 31.78125 -1.421875 \nQ 19.734375 -1.421875 13.25 4.234375 \nQ 6.78125 9.90625 6.78125 20.515625 \nQ 6.78125 27.484375 10.78125 32.3125 \nQ 14.796875 37.15625 21.921875 38.8125 \nz\nM 18.3125 54.390625 \nQ 18.3125 48.734375 21.84375 45.5625 \nQ 25.390625 42.390625 31.78125 42.390625 \nQ 38.140625 42.390625 41.71875 45.5625 \nQ 45.3125 48.734375 45.3125 54.390625 \nQ 45.3125 60.0625 41.71875 63.234375 \nQ 38.140625 66.40625 31.78125 66.40625 \nQ 25.390625 66.40625 21.84375 63.234375 \nQ 18.3125 60.0625 18.3125 54.390625 \nz\n\" id=\"DejaVuSans-56\"/>\n      </defs>\n      <g transform=\"translate(27.240625 47.103684)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-56\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"127.246094\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"190.869141\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"254.492188\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"ytick_6\">\n     <g id=\"line2d_10\">\n      <g>\n       <use style=\"stroke:#000000;stroke-width:0.8;\" x=\"66.053125\" xlink:href=\"#m6f72079ae8\" y=\"18.355581\"/>\n      </g>\n     </g>\n     <g id=\"text_11\">\n      <!-- 100000 -->\n      <defs>\n       <path d=\"M 12.40625 8.296875 \nL 28.515625 8.296875 \nL 28.515625 63.921875 \nL 10.984375 60.40625 \nL 10.984375 69.390625 \nL 28.421875 72.90625 \nL 38.28125 72.90625 \nL 38.28125 8.296875 \nL 54.390625 8.296875 \nL 54.390625 0 \nL 12.40625 0 \nz\n\" id=\"DejaVuSans-49\"/>\n      </defs>\n      <g transform=\"translate(20.878125 22.1548)scale(0.1 -0.1)\">\n       <use xlink:href=\"#DejaVuSans-49\"/>\n       <use x=\"63.623047\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"127.246094\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"190.869141\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"254.492188\" xlink:href=\"#DejaVuSans-48\"/>\n       <use x=\"318.115234\" xlink:href=\"#DejaVuSans-48\"/>\n      </g>\n     </g>\n    </g>\n    <g id=\"text_12\">\n     <!-- count -->\n     <g transform=\"translate(14.798438 89.25625)rotate(-90)scale(0.1 -0.1)\">\n      <use xlink:href=\"#DejaVuSans-99\"/>\n      <use x=\"54.980469\" xlink:href=\"#DejaVuSans-111\"/>\n      <use x=\"116.162109\" xlink:href=\"#DejaVuSans-117\"/>\n      <use x=\"179.541016\" xlink:href=\"#DejaVuSans-110\"/>\n      <use x=\"242.919922\" xlink:href=\"#DejaVuSans-116\"/>\n     </g>\n    </g>\n   </g>\n   <g id=\"patch_23\">\n    <path d=\"M 66.053125 143.1 \nL 66.053125 7.2 \n\" style=\"fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;\"/>\n   </g>\n   <g id=\"patch_24\">\n    <path d=\"M 261.353125 143.1 \nL 261.353125 7.2 \n\" style=\"fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;\"/>\n   </g>\n   <g id=\"patch_25\">\n    <path d=\"M 66.053125 143.1 \nL 261.353125 143.1 \n\" style=\"fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;\"/>\n   </g>\n   <g id=\"patch_26\">\n    <path d=\"M 66.053125 7.2 \nL 261.353125 7.2 \n\" style=\"fill:none;stroke:#000000;stroke-linecap:square;stroke-linejoin:miter;stroke-width:0.8;\"/>\n   </g>\n   <g id=\"legend_1\">\n    <g id=\"patch_27\">\n     <path d=\"M 189.15 44.55625 \nL 254.353125 44.55625 \nQ 256.353125 44.55625 256.353125 42.55625 \nL 256.353125 14.2 \nQ 256.353125 12.2 254.353125 12.2 \nL 189.15 12.2 \nQ 187.15 12.2 187.15 14.2 \nL 187.15 42.55625 \nQ 187.15 44.55625 189.15 44.55625 \nz\n\" style=\"fill:#ffffff;opacity:0.8;stroke:#cccccc;stroke-linejoin:miter;\"/>\n    </g>\n    <g id=\"patch_28\">\n     <path d=\"M 191.15 23.798437 \nL 211.15 23.798437 \nL 211.15 16.798437 \nL 191.15 16.798437 \nz\n\" style=\"fill:#1f77b4;\"/>\n    </g>\n    <g id=\"text_13\">\n     <!-- source -->\n     <g transform=\"translate(219.15 23.798437)scale(0.1 -0.1)\">\n      <use xlink:href=\"#DejaVuSans-115\"/>\n      <use x=\"52.099609\" xlink:href=\"#DejaVuSans-111\"/>\n      <use x=\"113.28125\" xlink:href=\"#DejaVuSans-117\"/>\n      <use x=\"176.660156\" xlink:href=\"#DejaVuSans-114\"/>\n      <use x=\"215.523438\" xlink:href=\"#DejaVuSans-99\"/>\n      <use x=\"270.503906\" xlink:href=\"#DejaVuSans-101\"/>\n     </g>\n    </g>\n    <g id=\"patch_29\">\n     <path d=\"M 191.15 38.476562 \nL 211.15 38.476562 \nL 211.15 31.476562 \nL 191.15 31.476562 \nz\n\" style=\"fill:url(#hd74d4c9235);\"/>\n    </g>\n    <g id=\"text_14\">\n     <!-- target -->\n     <defs>\n      <path d=\"M 34.28125 27.484375 \nQ 23.390625 27.484375 19.1875 25 \nQ 14.984375 22.515625 14.984375 16.5 \nQ 14.984375 11.71875 18.140625 8.90625 \nQ 21.296875 6.109375 26.703125 6.109375 \nQ 34.1875 6.109375 38.703125 11.40625 \nQ 43.21875 16.703125 43.21875 25.484375 \nL 43.21875 27.484375 \nz\nM 52.203125 31.203125 \nL 52.203125 0 \nL 43.21875 0 \nL 43.21875 8.296875 \nQ 40.140625 3.328125 35.546875 0.953125 \nQ 30.953125 -1.421875 24.3125 -1.421875 \nQ 15.921875 -1.421875 10.953125 3.296875 \nQ 6 8.015625 6 15.921875 \nQ 6 25.140625 12.171875 29.828125 \nQ 18.359375 34.515625 30.609375 34.515625 \nL 43.21875 34.515625 \nL 43.21875 35.40625 \nQ 43.21875 41.609375 39.140625 45 \nQ 35.0625 48.390625 27.6875 48.390625 \nQ 23 48.390625 18.546875 47.265625 \nQ 14.109375 46.140625 10.015625 43.890625 \nL 10.015625 52.203125 \nQ 14.9375 54.109375 19.578125 55.046875 \nQ 24.21875 56 28.609375 56 \nQ 40.484375 56 46.34375 49.84375 \nQ 52.203125 43.703125 52.203125 31.203125 \nz\n\" id=\"DejaVuSans-97\"/>\n      <path d=\"M 45.40625 27.984375 \nQ 45.40625 37.75 41.375 43.109375 \nQ 37.359375 48.484375 30.078125 48.484375 \nQ 22.859375 48.484375 18.828125 43.109375 \nQ 14.796875 37.75 14.796875 27.984375 \nQ 14.796875 18.265625 18.828125 12.890625 \nQ 22.859375 7.515625 30.078125 7.515625 \nQ 37.359375 7.515625 41.375 12.890625 \nQ 45.40625 18.265625 45.40625 27.984375 \nz\nM 54.390625 6.78125 \nQ 54.390625 -7.171875 48.1875 -13.984375 \nQ 42 -20.796875 29.203125 -20.796875 \nQ 24.46875 -20.796875 20.265625 -20.09375 \nQ 16.0625 -19.390625 12.109375 -17.921875 \nL 12.109375 -9.1875 \nQ 16.0625 -11.328125 19.921875 -12.34375 \nQ 23.78125 -13.375 27.78125 -13.375 \nQ 36.625 -13.375 41.015625 -8.765625 \nQ 45.40625 -4.15625 45.40625 5.171875 \nL 45.40625 9.625 \nQ 42.625 4.78125 38.28125 2.390625 \nQ 33.9375 0 27.875 0 \nQ 17.828125 0 11.671875 7.65625 \nQ 5.515625 15.328125 5.515625 27.984375 \nQ 5.515625 40.671875 11.671875 48.328125 \nQ 17.828125 56 27.875 56 \nQ 33.9375 56 38.28125 53.609375 \nQ 42.625 51.21875 45.40625 46.390625 \nL 45.40625 54.6875 \nL 54.390625 54.6875 \nz\n\" id=\"DejaVuSans-103\"/>\n     </defs>\n     <g transform=\"translate(219.15 38.476562)scale(0.1 -0.1)\">\n      <use xlink:href=\"#DejaVuSans-116\"/>\n      <use x=\"39.208984\" xlink:href=\"#DejaVuSans-97\"/>\n      <use x=\"100.488281\" xlink:href=\"#DejaVuSans-114\"/>\n      <use x=\"139.851562\" xlink:href=\"#DejaVuSans-103\"/>\n      <use x=\"203.328125\" xlink:href=\"#DejaVuSans-101\"/>\n      <use x=\"264.851562\" xlink:href=\"#DejaVuSans-116\"/>\n     </g>\n    </g>\n   </g>\n  </g>\n </g>\n <defs>\n  <clipPath id=\"pff4c7d2db4\">\n   <rect height=\"135.9\" width=\"195.3\" x=\"66.053125\" y=\"7.2\"/>\n  </clipPath>\n </defs>\n <defs>\n  <pattern height=\"72\" id=\"hd74d4c9235\" patternUnits=\"userSpaceOnUse\" width=\"72\" x=\"0\" y=\"0\">\n   <rect fill=\"#ff7f0e\" height=\"73\" width=\"73\" x=\"0\" y=\"0\"/>\n   <path d=\"M -36 36 \nL 36 -36 \nM -24 48 \nL 48 -24 \nM -12 60 \nL 60 -12 \nM 0 72 \nL 72 0 \nM 12 84 \nL 84 12 \nM 24 96 \nL 96 24 \nM 36 108 \nL 108 36 \n\" style=\"fill:#000000;stroke:#000000;stroke-linecap:butt;stroke-linejoin:miter;stroke-width:1.0;\"/>\n  </pattern>\n </defs>\n</svg>\n"
          },
          "metadata": {
            "needs_background": "light"
          }
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "##词表"
      ],
      "metadata": {
        "id": "eG1iSOv0jKL2"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])\n",
        "len(src_vocab)"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "KpMT8RFXjHbA",
        "outputId": "28693aaa-f684-41fb-dcd9-4658b482b627"
      },
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "10012"
            ]
          },
          "metadata": {},
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "##加载数据集"
      ],
      "metadata": {
        "id": "THSZVlorjgDQ"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def truncate_pad(line, num_steps, padding_token):\n",
        "    \"\"\"截断或填充文本序列\"\"\"\n",
        "    if len(line) > num_steps:\n",
        "        return line[:num_steps] # 截断\n",
        "    return line + [padding_token] * (num_steps - len(line)) # 填充\n",
        "\n",
        "truncate_pad(src_vocab[source[0]], 10, src_vocab['<pad>'])"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "05Y0Jr3Aje15",
        "outputId": "83f0773c-4f87-4133-ee3e-09c12e9f0cf8"
      },
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "[47, 4, 1, 1, 1, 1, 1, 1, 1, 1]"
            ]
          },
          "metadata": {},
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "def build_array_nmt(lines,vocab,num_steps):\n",
        "    \"\"\"将机器翻译的文本序列转换成小批量\"\"\"\n",
        "    lines = [vocab[l] for l in lines]\n",
        "    lines = [l + [vocab['<eos>']] for l in lines]\n",
        "    array = torch.tensor([truncate_pad(l,num_steps,vocab['<pad>']) for l in lines])\n",
        "    valid_len = (array != vocab['<pad>']).type(torch.int32).sum(1)\n",
        "    return array, valid_len"
      ],
      "metadata": {
        "id": "EixkhqVPkMkC"
      },
      "execution_count": 18,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "##训练模型"
      ],
      "metadata": {
        "id": "-35TzqErlGRc"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def load_data_nmt(batch_size,num_steps,num_examples=600):\n",
        "    \"\"\"返回翻译数据集的迭代器和词表\"\"\"\n",
        "    text = preprocess_nmt(read_data_nmt())\n",
        "    source, target = tokenize_nmt(text,num_examples)\n",
        "    src_vocab = d2l.Vocab(source,min_freq=2,reserved_tokens=['<pad>','<bos>','<eos>'])\n",
        "    tgt_vocab = d2l.Vocab(target,min_freq=2,reserved_tokens=['<pad>','<bos>','<eos>'])\n",
        "    src_array,src_valid_len = build_array_nmt(source,src_vocab,num_steps)\n",
        "    tgt_array,tgt_valid_len = build_array_nmt(target,tgt_vocab,num_steps)\n",
        "    data_arrays = (src_array,src_valid_len,tgt_array,tgt_valid_len)\n",
        "    data_iter = d2l.load_array(data_arrays,batch_size)\n",
        "    return data_iter,src_vocab,tgt_vocab"
      ],
      "metadata": {
        "id": "ejpKvE2tlFWR"
      },
      "execution_count": 19,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "train_iter,src_vocab,tgt_vocab = load_data_nmt(batch_size=2,num_steps=8)\n",
        "for X,X_valid_len,Y,Y_valid_len in train_iter:\n",
        "    print('X:',X.type(torch.int32))\n",
        "    print('X的有效长度:',X_valid_len)\n",
        "    print('Y:',Y.type(torch.int32))\n",
        "    print('Y的有效长度:',Y_valid_len)\n",
        "    break"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-qF-R81Mm1kx",
        "outputId": "d74338a9-2984-4ace-92ad-7872576abf7b"
      },
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "X: tensor([[ 13,  33,   4,   3,   1,   1,   1,   1],\n",
            "        [170,   8,   4,   3,   1,   1,   1,   1]], dtype=torch.int32)\n",
            "X的有效长度: tensor([4, 4])\n",
            "Y: tensor([[187,   4,   3,   1,   1,   1,   1,   1],\n",
            "        [  0,   5,   3,   1,   1,   1,   1,   1]], dtype=torch.int32)\n",
            "Y的有效长度: tensor([3, 3])\n"
          ]
        }
      ]
    }
  ]
}